{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "dd2c8c55",
   "metadata": {},
   "source": [
    "# Preprocessing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "716880ac",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from matplotlib import pyplot as plt\n",
    "from sklearn.model_selection import train_test_split\n",
    "from scipy.io import savemat, loadmat\n",
    "import os"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4d7dc4a0",
   "metadata": {},
   "source": [
    "## Step 1:\n",
    "Convert the dataset to `.mat` format for MATLAB."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "711356a2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the raw mango NIR table from CSV.\n",
    "dataset = pd.read_csv('dataset/mango/NAnderson2020MendeleyMangoNIRData.csv')\n",
    "\n",
    "# Target is the DM column; features are every column from label '684' to\n",
    "# label '990' (label-based .loc slices include both end points).\n",
    "y = dataset['DM']\n",
    "x = dataset.loc[:, '684':'990']\n",
    "\n",
    "# Export the underlying arrays to a .mat file for the MATLAB pre-processing step.\n",
    "mat_payload = {'x': x.values, 'y': y.values}\n",
    "savemat('dataset/mango/mango_origin.mat', mat_payload)"
   ]
  },
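  {
   "cell_type": "markdown",
   "id": "3e41e8e6",
   "metadata": {},
   "source": [
    "## Step 2:\n",
    "Pre-process the data in MATLAB by running the `preprocess_mango.m` script:\n",
    "```matlab\n",
    "run('preprocess_mango.m');\n",
    "```\n",
    "The next code cell loads the result from `dataset/mango/mango_preprocessed.mat`."
   ]
  },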
  {
   "cell_type": "markdown",
   "id": "ea5e54fd",
   "metadata": {},
   "source": [
    "## Step 3:\n",
    "Split the data into training and test sets with scikit-learn's `train_test_split`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "6eac026e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the pre-processed data written by the MATLAB step.\n",
    "data = loadmat('dataset/mango/mango_preprocessed.mat')\n",
    "x, y = data['x'], data['y']\n",
    "\n",
    "# Hold out 30% of the samples for testing; the fixed random_state keeps the\n",
    "# split identical on every run.\n",
    "x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=24)\n",
    "\n",
    "# exist_ok=True makes the cell safe to re-run and removes the\n",
    "# check-then-create race of a separate os.path.exists() test.\n",
    "os.makedirs('mango', exist_ok=True)\n",
    "\n",
    "# Store the split together with the min/max entries carried over from the\n",
    "# pre-processed file (presumably the scaling ranges - confirm in preprocess_mango.m).\n",
    "savemat('mango/mango_dm_split.mat', {\n",
    "    'x_train': x_train, 'y_train': y_train,\n",
    "    'x_test': x_test, 'y_test': y_test,\n",
    "    'max_y': data['max_y'], 'min_y': data['min_y'],\n",
    "    'min_x': data['min_x'], 'max_x': data['max_x'],\n",
    "})"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b2977dae",
   "metadata": {},
   "source": [
    "## Step 4:\n",
    "Show the data with pictures.\n",
    "\n",
    "Use `draw_pics_origin` to draw the original spectra:\n",
    "\n",
    "![img](./raw.png)\n",
    "\n",
    "Use `draw_pics_preprocessed.m` to draw the preprocessed spectra:\n",
    "\n",
    "![img](./preprocessed.png)"
   ]
  }
 ],
 "metadata": {
  "interpreter": {
   "hash": "7f619fc91ee8bdab81d49e7c14228037474662e3f2d607687ae505108922fa06"
  },
  "kernelspec": {
   "display_name": "Python 3.9.7 64-bit ('base': conda)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}